library(tidyr) 
library(tidyverse)
library(dplyr) 
library(purrr)
library(ggplot2) 
library(plotly)
library(corrplot)
library(RColorBrewer)
library(lubridate)
#library(DataExplorer)

Read Data

games <- read.csv("/Users/jennyli/Desktop/VGR/video-game-reviews/clean_data/clean_data_model.csv")

Summary Statistics

summary(games$user_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0    63.0    73.0    69.8    80.0    97.0
summary(games$meta_reviews)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   11.00   18.00   23.94   31.00  126.00
summary(games$user_reviews)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3      14      34     204     107  156973
summary(games$meta_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.00   64.00   73.00   71.13   80.00   99.00
summary(games$years_since_released)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1889  4.7748 10.1205 10.2842 15.3696 26.8392
#Most Common
rank_platform <- sort (table(games$platform), decreasing = TRUE)
#Top 5 platforms
head(rank_platform)
## 
##            PC PlayStation 4      Xbox 360 PlayStation 2        Switch 
##          4358          1840          1449          1189          1149 
## PlayStation 3 
##          1143
#PC, PlayStation4, Xbox 360,PlayStation2, Switch, and PlayStation3

rank_publisher<- sort (table(games$publishers), decreasing = TRUE)
#Top 5 publisher
head(rank_publisher)
## 
##         Ubisoft Electronic Arts      Activision        Nintendo            Sega 
##             726             660             624             551             540 
##          Capcom 
##             460
#Ubisoft, Electronic Arts, Activision, Nintendo, Sega, and Capcom

rank_developer <- sort (table(games$developers), decreasing = TRUE)
#Top 5 developer
head(rank_developer)
## 
##           Capcom   Telltale Games        EA Sports        EA Canada 
##              280              217              176              155 
##           Konami Ubisoft Montreal 
##              148              139
# Capcom, Telltale games, EA Sports, EA Canada, Knami, Ubisoft Montreal
#Distributions/ Histogram of scores
#user score
hist(games$user_score)

#meta score
hist(games$meta_score)

#Visualizing the difference between meta score and user score
games$score_diff <- games$meta_score -games$user_score
hist(games$score_diff)

plot_ly(data = games, type = "scatter",  mode = "markers", x = ~user_score, y = ~meta_score)
## Warning: `arrange_()` was deprecated in dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
# Correlations for numeric columns 
clean_game_cor <-select(games,user_score, user_reviews, meta_score, meta_reviews, best_game, most_discussed, most_shared, score_diff, years_since_released)

M <-cor(clean_game_cor)
corrplot(M, type="upper", order="hclust",
         col=brewer.pal(n=8, name="RdYlBu"))

#Frequency of each esrb content descriptor
esrb_freq <-list("Blood",sum(games$esrb_descs_Blood), 
                 "Gambling", sum(games$esrb_descs_Gambling), 
                 "Humor", sum(games$esrb_descs_Humor),
                 "Language", sum(games$esrb_descs_Language),
                 "Nudity", sum(games$esrb_descs_Nudity),
                 "Violence", sum(games$esrb_descs_Violence),
                 "Missing", sum(games$esrb_descs_missing))

## Violence 6081,  Blood 2790, Language 2422
#summary(games)
#plot_intro(games)

#Popular Genres
#Convert release date to year
games$year_released = year(as.character(games$release_date))
genres = games %>% select(year_released, starts_with("genres_"))

#Genres type by frequency, sorted in descending order
sort_genres = data.frame(colSums(genres[2:ncol(genres)])) %>% rownames_to_column()  %>% rename(count = colSums.genres.2.ncol.genres..., genre_type = rowname) %>% arrange(-count)

#Get the top 20 most frequent genres
top20_genres = sort_genres[1:20,]

#Group top 20 by year and summarize the count
top20_genres_by_year = genres %>% select(year_released, top20_genres$genre_type) %>% group_by(year_released) %>% summarise(across(everything(), sum)) 

#Tidy data for plotly
top20_genres_by_year_tidy_count = top20_genres_by_year %>% gather(key = genre_type, value = count, 2:21)
top20_genres_by_year_tidy_count$genre_type = gsub("genres_", "", top20_genres_by_year_tidy_count$genre_type) #clean type name

#Interactive visualization with plotly (by count)
visual_top20_by_count = plot_ly(top20_genres_by_year_tidy_count, type = "bar", x = ~fct_reorder(genre_type, count), y = ~count, frame = ~year_released, showlegend = FALSE)
visual_top20_by_count
#Top 20 by percent
total_games_by_year = games %>% count(year_released) # calculate total number of games released per year
top20_genres_by_year_tidy_percent = top20_genres_by_year %>% left_join(total_games_by_year, by="year_released")
top20_genres_by_year_tidy_percent = top20_genres_by_year_tidy_percent %>% mutate(across(starts_with("genres_"), .fns = ~./n * 100))

#Tidy data for plotly
top20_genres_by_year_tidy_percent = top20_genres_by_year_tidy_percent %>% select(-n) %>% gather(key = genre_type, value = percent, starts_with("genres_"))
top20_genres_by_year_tidy_percent$genre_type = gsub("genres_", "", top20_genres_by_year_tidy_percent$genre_type) #clean type name

#Interactive visualization with plotly (by percent)
visual_top20_by_percent = plot_ly(top20_genres_by_year_tidy_percent, type = "bar", x = ~fct_reorder(genre_type, percent), y = ~percent, frame = ~year_released, showlegend = FALSE) 
visual_top20_by_percent
#ESRB Rating Trends across year
ESRB = games %>% count(year_released, esrb_ratings) %>% na.omit()
ESRB_by_year = ESRB %>% group_by(year_released) %>% summarise(sum(n)) %>% rename(total_by_year = `sum(n)`)
ESRB = ESRB %>% left_join(ESRB_by_year, by="year_released")
ESRB$percent = ESRB$n / ESRB$total_by_year
ESRB = ESRB %>% select(-n, -total_by_year)
#Colors
colors_rsrb_rating = list(
  "E" = "#FF99FF",
  "E10+" = "#FFCCFF",
  "M" = "#CC99FF",
  "T" = "#99CCFF",
  "AO" = "#6699CC",
  "K-A" = "#9999FF",
  "RP" = "#003399"
)
ESRB$color = dplyr::recode(ESRB$esrb_ratings, !!!colors_rsrb_rating)
plot_ly(ESRB, type = "pie",labels = ~esrb_ratings, values = ~percent, frame = ~year_released,
        textinfo = 'label+percent', marker = list( colors = ~color), sort = FALSE)
#Trends of ESRB content descriptions over years
esrb_content = games %>% select(year_released, starts_with("esrb_descs_"))

#Group by year and summarize the count
esrb_content_by_year = esrb_content %>% group_by(year_released) %>% summarise(across(everything(), sum)) 

#Tidy data for plotly
esrb_content_by_year_tidy_count = esrb_content_by_year %>% gather(key = esrb_content, value = count, 2:8)
esrb_content_by_year_tidy_count$esrb_content = gsub("esrb_descs_", "", esrb_content_by_year_tidy_count$esrb_content) #clean type name
esrb_content_by_year_tidy_count = esrb_content_by_year_tidy_count[!esrb_content_by_year_tidy_count$esrb_content == "missing",] #drop missing values

#Interactive visualization with plotly (by count)
visual_esrb_content_by_count = plot_ly(esrb_content_by_year_tidy_count, type = "bar", x = ~fct_reorder(esrb_content, count), y = ~count, frame = ~year_released, showlegend = FALSE)
visual_esrb_content_by_count
##missing data for esrb content descriptions from 2015-2021

#by percent
total_games_by_year = games %>% count(year_released) # calculate total number of games released per year
esrb_content_by_year_tidy_percent = esrb_content_by_year %>% left_join(total_games_by_year, by="year_released")
esrb_content_by_year_tidy_percent = esrb_content_by_year_tidy_percent %>% mutate(across(starts_with("esrb_descs_"), .fns = ~./n * 100))

#Tidy data for plotly
esrb_content_by_year_tidy_percent = esrb_content_by_year_tidy_percent %>% select(-n) %>% gather(key = esrb_content, value = percent, starts_with("esrb_descs_"))
esrb_content_by_year_tidy_percent$esrb_content = gsub("esrb_descs_", "", esrb_content_by_year_tidy_percent$esrb_content) #clean type name

#Interactive visualization with plotly (by percent)
visual_esrb_content_by_percent = plot_ly(esrb_content_by_year_tidy_percent, type = "bar", x = ~fct_reorder(esrb_content, percent), y = ~percent, frame = ~year_released, showlegend = FALSE) 
visual_esrb_content_by_percent
#Platform Trends across year
summary(games$platform)
##    Length     Class      Mode 
##     16262 character character
platform = games %>% count(year_released, platform) %>% na.omit()
platform_by_year = platform %>% group_by(year_released) %>% summarise(sum(n)) %>% rename(total_by_year = `sum(n)`)
platform = platform %>% left_join(platform_by_year, by="year_released")
platform$percent = platform$n / platform$total_by_year
platform = platform %>% select(-n, -total_by_year)

colors_platform = list(
  "PC" = "#FF99FF",
  "Nintendo 64" = "#FFCCFF",
  "PlayStation" = "#CC99FF",
  "PlayStation 2" = "#99CCFF",
  "PlayStation 3" = "#6699CC",
  "PlayStation 4" = "#9999FF",
  "PlayStation 5" = "#003399",
  "PlayStation Vita" = "#666699",
  "Dreamcast" = "#FF0000",
  "Game Boy Advance" = "#FFCC33",
  "GameCube" = "#FF9900",
  "Xbox" = "#FF6600",
  "Xbox 360" = "#CC6600",
  "Xbox One" = "#663300",
  "PSP" = "#CCFF99",
  "Wii" = "#99CC66",
  "Wii U" = "#669933",
  "DS" = "#336600",
  "3DS" = "#FFFFCC",
  "Switch" = "#660033",
  "Stadia" = "#330033"
)
platform$color = dplyr::recode(platform$platform, !!!colors_platform)
plot_ly(platform, type = "pie",labels = ~platform, values = ~percent, frame = ~year_released,
        textinfo = 'label+percent', marker = list(colors = ~color), sort = FALSE)
#Correlation b/t metascore & userscore 
Corr = cor(games$meta_score, games$user_score)
Corr 
## [1] 0.5338949
#distribution user & metacritic scores
ggplot(games) + geom_histogram(mapping = aes(x = user_score)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(games) + geom_histogram(mapping = aes(x = meta_score)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.